import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import torch
# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
# embedding
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
imports
def build_graph_bipartite(df_input, graph_type=None):
    """Build a card/merchant graph with one edge per (card, merchant) pair.

    Every distinct ``cc_num`` and ``merchant`` value gets an integer node id.
    Transactions sharing the same card and merchant are collapsed into a
    single edge carrying two attributes:

    * ``"label"``  -- 1 if any collapsed transaction has ``is_fraud > 0``,
      otherwise 0.
    * ``"weight"`` -- the sum of ``amt`` over the collapsed transactions.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns.  A copy is used; the input is not modified.
        graph_type: Passed as ``create_using`` to ``nx.from_edgelist``.
            ``None`` (default) yields a fresh ``nx.Graph`` on every call.
            The previous default, a single ``nx.Graph()`` instance created at
            definition time, was cleared and reused by each call, so a second
            call overwrote the graph returned by the first.

    Returns:
        The populated networkx graph.
    """
    df = df_input.copy()

    # One integer node id per distinct card number / merchant name.
    mapping = {
        value: node_id
        for node_id, value in enumerate(
            set(df["cc_num"].values.tolist() + df["merchant"].values.tolist())
        )
    }
    df["from"] = df["cc_num"].apply(lambda x: mapping[x])  # edge source
    df["to"] = df["merchant"].apply(lambda x: mapping[x])  # edge target

    # Collapse repeated (card, merchant) pairs into one row per edge.
    df = (
        df[["from", "to", "amt", "is_fraud"]]
        .groupby(["from", "to"])
        .agg({"is_fraud": "sum", "amt": "sum"})
        .reset_index()
    )
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)

    # Edge keys are plain ints; they hash/compare equal to the numpy integer
    # node objects created by from_edgelist above.
    edge_keys = list(zip(df["from"].tolist(), df["to"].tolist()))
    # Edge attribute: fraud flag of each edge.
    nx.set_edge_attributes(G, dict(zip(edge_keys, df["is_fraud"].tolist())), "label")
    # Edge attribute: total transaction amount of each edge.
    nx.set_edge_attributes(G, dict(zip(edge_keys, df["amt"].tolist())), "weight")

    return G
def build_graph_tripartite(df_input, graph_type=None):
    """Build a card / transaction / merchant graph.

    Each row (transaction) of the frame becomes its own node, keyed by the
    row's index label.  Two edges are added per transaction: card -> transaction
    and merchant -> transaction.  Both edges carry the transaction's
    ``is_fraud`` value as ``"label"`` and its ``amt`` as ``"weight"``.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns.  A copy is used; the input is not modified.
        graph_type: Passed as ``create_using`` to ``nx.from_edgelist``.
            ``None`` (default) yields a fresh ``nx.Graph`` on every call
            instead of one instance shared (and cleared) across calls.

    Returns:
        The populated networkx graph.
    """
    df = df_input.copy()

    # Node ids are drawn from the union of index labels, card numbers and
    # merchant names.
    # NOTE(review): an index label that equals a cc_num value would be merged
    # into the same node -- confirm this cannot happen for the data in use.
    mapping = {
        value: node_id
        for node_id, value in enumerate(
            set(
                df.index.values.tolist()
                + df["cc_num"].values.tolist()
                + df["merchant"].values.tolist()
            )
        )
    }
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])

    # Build everything column-wise in one pass instead of six iterrows() scans.
    tx_nodes = [mapping[idx] for idx in df.index]
    in_edges = list(zip(df["in_node"].tolist(), tx_nodes))
    out_edges = list(zip(df["out_node"].tolist(), tx_nodes))
    all_edges = in_edges + out_edges
    labels = df["is_fraud"].tolist()
    amounts = df["amt"].tolist()

    G = nx.from_edgelist(all_edges, create_using=graph_type)

    # Same order as before: card-side edges first, merchant-side edges second.
    nx.set_edge_attributes(G, dict(zip(all_edges, labels + labels)), "label")
    nx.set_edge_attributes(G, dict(zip(all_edges, amounts + amounts)), "weight")

    return G
def down_sample_textbook(df):
    """Balance the classes by down-sampling the non-fraud rows.

    Rows with ``is_fraud == 0`` are sampled without replacement
    (``random_state=42``) down to the number of ``is_fraud == 1`` rows, and the
    result is concatenated after the fraud rows.

    Args:
        df: DataFrame with an ``is_fraud`` column.

    Returns:
        DataFrame holding every fraud row followed by the sampled non-fraud
        rows; original index labels are kept.

    Raises:
        ValueError: raised by ``resample`` when there are fewer non-fraud rows
            than fraud rows (sampling is without replacement).
    """
    # Imported explicitly so the function does not rely on ``sklearn.utils``
    # happening to be loaded as a side effect of other sklearn imports.
    from sklearn.utils import resample

    df_majority = df[df.is_fraud == 0].copy()
    df_minority = df[df.is_fraud == 1].copy()
    df_maj_downsampled = resample(
        df_majority,
        n_samples=len(df_minority),
        replace=False,
        random_state=42,
    )
    return pd.concat([df_minority, df_maj_downsampled])
def embedding(Graph):
    """Turn a labelled graph into edge features and edge labels.

    Node2Vec (``weight_key='weight'``, ``window=10``) is fitted on a copy of
    the graph, and each edge is embedded with ``AverageEmbedder`` using the
    string form of its two end nodes.

    Args:
        Graph: networkx graph whose edges carry ``"weight"`` and ``"label"``
            attributes.

    Returns:
        Tuple ``(X, y)``: ``X`` is a list with one edge embedding per edge, in
        ``Graph.edges`` order; ``y`` is a numpy array of the ``"label"`` edge
        attribute values.
    """
    # Graph -> X (feature)
    edges = list(Graph.edges)
    subGraph = Graph.edge_subgraph(edges).copy()
    # edge_subgraph drops isolated nodes; put them back before the walks.
    subGraph.add_nodes_from(list(set(Graph.nodes) - set(subGraph.nodes)))
    embedded = AverageEmbedder(
        Node2Vec(subGraph, weight_key='weight').fit(window=10).wv
    )
    # Index by position so edges with an extra key element still work.
    X = [embedded[str(edge[0]), str(edge[1])] for edge in edges]

    # Graph -> y (label)
    y = np.array(list(nx.get_edge_attributes(Graph, "label").values()))
    return X, y
def anal(df):
    """Fit a random forest on edge embeddings and report test metrics.

    The frame is turned into a bipartite graph, its edges are embedded, the
    embedded edges are split into train and test parts, and a
    ``RandomForestClassifier`` trained on the train part is scored on the test
    part.

    Fixes relative to the previous version:
    * ``embedding`` returns ``(X, y)``, so unpacking it into four names raised
      ``ValueError``; the train/test split is now done explicitly here.
    * ``RandomForestClassifier`` was referenced without being imported; it is
      now taken from the already imported ``sklearn.ensemble`` module.

    Args:
        df: transaction DataFrame accepted by ``build_graph_bipartite``.

    Returns:
        One-row DataFrame with columns ``acc``, ``pre``, ``rec`` and ``f1``.
    """
    Graph = build_graph_bipartite(df)
    features, labels = embedding(Graph)
    # NOTE(review): default test fraction and random_state=42 mirror the
    # train_test_split call used elsewhere in this file -- confirm intended.
    X, XX, y, yy = model_selection.train_test_split(
        features, labels, random_state=42
    )

    lrnr = ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
    lrnr.fit(X, y)
    yyhat = lrnr.predict(XX)

    scores = pd.DataFrame({
        'acc': [metrics.accuracy_score(yy, yyhat)],
        'pre': [metrics.precision_score(yy, yyhat)],
        'rec': [metrics.recall_score(yy, yyhat)],
        'f1': [metrics.f1_score(yy, yyhat)],
    })
    return scores
def our_sampling1(df):
    """Keep every transaction of each card that has at least one fraud.

    Args:
        df: DataFrame with ``cc_num`` and ``is_fraud`` columns.

    Returns:
        The rows of ``df`` (original order and index labels) whose ``cc_num``
        appears on at least one row with ``is_fraud == 1``.  Both fraudulent
        and non-fraudulent rows of those cards are returned.
    """
    cus_list = set(df.loc[df["is_fraud"] == 1, "cc_num"].tolist())
    # Boolean mask instead of a query string: no dependence on how the
    # expression parser resolves the local name.
    return df[df["cc_num"].isin(cus_list)]
# Load the raw transactions; the first CSV column is dropped.
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
# Parse the timestamp column with one vectorized call instead of a
# per-element map over the column.
fraudTrain = fraudTrain.assign(
    trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time)
)
fraudTrain
trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
1 | 2019-01-01 00:00:00 | 6.304230e+11 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
2 | 2019-01-01 00:00:00 | 3.885950e+13 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
3 | 2019-01-01 00:01:00 | 3.534090e+15 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
4 | 2019-01-01 00:03:00 | 3.755340e+14 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |
1048571 | 2020-03-10 16:07:00 | 4.839040e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 116.94 | Meredith | Campbell | F | 043 Hanson Turnpike | Hedrick | ... | 41.1826 | -92.3097 | 1583 | Geochemist | 1999-06-28 | c00ce51c6ebb7657474a77b9e0b51f34 | 1362931670 | 41.400318 | -92.726724 | 0 |
1048572 | 2020-03-10 16:08:00 | 5.718440e+11 | fraud_O'Connell, Botsford and Hand | home | 21.27 | Susan | Mills | F | 005 Cody Estates | Louisville | ... | 38.2507 | -85.7476 | 736284 | Engineering geologist | 1952-04-02 | 17c9dc8b2a6449ca2473726346e58e6c | 1362931711 | 37.293339 | -84.798122 | 0 |
1048573 | 2020-03-10 16:08:00 | 4.646850e+18 | fraud_Thompson-Gleason | health_fitness | 9.52 | Julia | Bell | F | 576 House Crossroad | West Sayville | ... | 40.7320 | -73.1000 | 4056 | Film/video editor | 1990-06-25 | 5ca650881b48a6a38754f841c23b77ab | 1362931718 | 39.773077 | -72.213209 | 0 |
1048574 | 2020-03-10 16:08:00 | 2.283740e+15 | fraud_Buckridge PLC | misc_pos | 6.81 | Shannon | Williams | F | 9345 Spencer Junctions Suite 183 | Alpharetta | ... | 34.0770 | -84.3033 | 165556 | Prison officer | 1997-12-27 | 8d0a575fe635bbde12f1a2bffc126731 | 1362931730 | 33.601468 | -83.891921 | 0 |
1048575 rows × 22 columns
시도
# Keep a 20% random sample of the non-fraud rows and every fraud row.
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02.shape
(214520, 22)
# Down-sample the non-fraud rows so both classes have the same size.
df50 = down_sample_textbook(df02)
df50.shape
(12012, 22)
12012*12012
144288144
# reset_index() keeps the old index labels as a new "index" column.
df50 = df50.reset_index()
N = len(df50)
tr/test
df50_tr, df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)
# Fraud rate of each part, as a quick balance check.
df50_tr.is_fraud.mean().round(5), df50_test.is_fraud.mean().round(5)
(0.49828, 0.50516)
df50_tr.shape, df50_test.shape
((9009, 23), (3003, 23))
# Masks over the rows of the train part followed by the test part.  Sizes are
# read from the split itself rather than hard-coded as 9009 / 3003.
n_train, n_test = len(df50_tr), len(df50_test)
train_mask = np.concatenate((np.full(n_train, True), np.full(n_test, False)))
test_mask = np.concatenate((np.full(n_train, False), np.full(n_test, True)))
print("Train Mask:", train_mask)
print("Test Mask:", test_mask)
Train Mask: [ True True True ... False False False]
Test Mask: [False False False ... True True True]
train_mask.shape, test_mask.shape
((12012,), (12012,))
train_mask.sum(), test_mask.sum()
(9009, 3003)
# Train rows first, then test rows, renumbered 0..N-1 to line up with the masks.
df50_com = pd.concat([df50_tr, df50_test])
df50_com = df50_com.reset_index()
aj_matrix
# edge_index_list2_com = []
# for i in range(N):
# for j in range(N):
# if df50_com['cc_num'][i] != df50_com['cc_num'][j]:
# edge = 0
# else:
# edge = 1
# edge_index_list2_com.append([i, j, edge])
#np.save('edge_index_list2_50_com.npy', edge_index_list2_com)
# Rows of [i, j, edge] saved earlier by the commented-out loop above.
edge_index_list2_com = np.load('edge_index_list2_50_com.npy')
# edge_index_list2_com
array([[ 0, 0, 1],
[ 0, 1, 0],
[ 0, 2, 0],
...,
[12011, 12009, 0],
[12011, 12010, 0],
[12011, 12011, 1]])
edge_index_list2_com.shape
(144288144, 3)
# One node per row of df50_com (previously hard-coded as 12012).
num_nodes = len(df50_com)
aj_matrix = np.zeros((num_nodes, num_nodes))
aj_matrix
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
# Fill the adjacency matrix with one indexed assignment instead of a Python
# loop over every [i, j, edge] row.
_rows = edge_index_list2_com[:, 0].astype(int)
_cols = edge_index_list2_com[:, 1].astype(int)
aj_matrix[_rows, _cols] = edge_index_list2_com[:, 2]
aj_matrix
array([[1., 0., 0., ..., 0., 0., 0.],
[0., 1., 0., ..., 0., 0., 0.],
[0., 0., 1., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 1., 0., 0.],
[0., 0., 0., ..., 0., 1., 0.],
[0., 0., 0., ..., 0., 0., 1.]])
# aj_matrix.shape
(12012, 12012)
# np.save('aj_matrix.npy', aj_matrix)
# aj_matrix = np.load('aj_matrix.npy')
# aj_matrix
array([[1., 0., 0., ..., 0., 0., 0.],
[0., 1., 0., ..., 0., 0., 0.],
[0., 0., 1., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 1., 0., 0.],
[0., 0., 0., ..., 0., 1., 0.],
[0., 0., 0., ..., 0., 0., 1.]])
weight matrix
# edge_index_list = []
# for i in range(N):
# for j in range(N):
# time_difference = (df50_com['trans_date_trans_time'][i] - df50_com['trans_date_trans_time'][j]).total_seconds()
# edge_index_list.append([i, j, time_difference])
# np.save('edge_index_list_50_com.npy', edge_index_list)
# Rows of [i, j, time_difference] saved earlier by the commented-out loop above.
edge_index_list = np.load('edge_index_list_50_com.npy')
edge_index_list[:5]
array([[ 0.000000e+00, 0.000000e+00, 0.000000e+00],
[ 0.000000e+00, 1.000000e+00, -2.030190e+07],
[ 0.000000e+00, 2.000000e+00, -2.841396e+07],
[ 0.000000e+00, 3.000000e+00, -2.383788e+07],
[ 0.000000e+00, 4.000000e+00, -2.687796e+07]])
edge_index = np.array(edge_index_list)
# Use the absolute time difference; theta is its mean over all pairs.
edge_index[:, 2] = np.abs(edge_index[:, 2])
theta = edge_index[:, 2].mean()
theta
12238996.895508753
# Weight = exp(-|dt| / theta), computed once.  Entries whose weight is exactly
# 1 (zero time difference) are set to 0.
_w = np.exp(-edge_index[:, 2] / theta)
edge_index[:, 2] = (_w != 1) * _w
edge_index
array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[0.00000000e+00, 1.00000000e+00, 1.90369587e-01],
[0.00000000e+00, 2.00000000e+00, 9.81172367e-02],
...,
[1.20110000e+04, 1.20090000e+04, 9.25720620e-01],
[1.20110000e+04, 1.20100000e+04, 5.15585903e-01],
[1.20110000e+04, 1.20110000e+04, 0.00000000e+00]])
w_matrix로 바꾸려고 하니 인덱스 형식이 `[i][j]`에 맞지 않는다! (i, j가 실수형이라 정수로 변환해야 한다.)
# # 출력 형식 변경
# np.set_printoptions(formatter={'int': '{:d}'.format})
# # 원래 출력 형식으로 복원
# np.set_printoptions(formatter=None)
w_matrix = np.zeros((num_nodes, num_nodes))
w_matrix
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
# edge_index is a float array, so the two index columns are cast to int.
# One indexed assignment replaces the Python loop over every row.
_rows = edge_index[:, 0].astype(int)
_cols = edge_index[:, 1].astype(int)
w_matrix[_rows, _cols] = edge_index[:, 2]
w_matrix
array([[0. , 0.19036959, 0.09811724, ..., 0.29671829, 0.14162023,
0.27467824],
[0.19036959, 0. , 0.51540395, ..., 0.6415836 , 0.74392254,
0.69306396],
[0.09811724, 0.51540395, 0. , ..., 0.33067472, 0.69281937,
0.3572079 ],
...,
[0.29671829, 0.6415836 , 0.33067472, ..., 0. , 0.4772885 ,
0.92572062],
[0.14162023, 0.74392254, 0.69281937, ..., 0.4772885 , 0. ,
0.5155859 ],
[0.27467824, 0.69306396, 0.3572079 , ..., 0.92572062, 0.5155859 ,
0. ]])
w_matrix.shape
(12012, 12012)
np.save('w_matrix.npy', w_matrix)
# np.save('edge_index_list_plus.npy', edge_index_list_plus)
edge_index_list_plus = np.load('edge_index_list_plus.npy')
edge_index = np.array(edge_index_list_plus)
edge_index.shape
(144288144, 3)
edge_index
array([[0.0000e+00, 0.0000e+00, 0.0000e+00],
[0.0000e+00, 1.0000e+00, 0.0000e+00],
[0.0000e+00, 2.0000e+00, 0.0000e+00],
...,
[1.2011e+04, 1.2009e+04, 0.0000e+00],
[1.2011e+04, 1.2010e+04, 0.0000e+00],
[1.2011e+04, 1.2011e+04, 0.0000e+00]])
# Use the absolute value of the third column; theta is its mean.
edge_index[:, 2] = np.abs(edge_index[:, 2])
theta = edge_index[:, 2].mean()
theta
10973.519989002007
# Weight = exp(-value / theta), computed once.  Entries whose weight is
# exactly 1 (value 0) are set to 0.
_w = np.exp(-edge_index[:, 2] / theta)
edge_index[:, 2] = (_w != 1) * _w
edge_index
array([[0.0000e+00, 0.0000e+00, 0.0000e+00],
[0.0000e+00, 1.0000e+00, 0.0000e+00],
[0.0000e+00, 2.0000e+00, 0.0000e+00],
...,
[1.2011e+04, 1.2009e+04, 0.0000e+00],
[1.2011e+04, 1.2010e+04, 0.0000e+00],
[1.2011e+04, 1.2011e+04, 0.0000e+00]])
# NOTE(review): the full Python-list copy is kept only in case later cells
# reference it; nothing below needs it -- drop it if unused, it is very large.
edge_index_list_updated = edge_index.tolist()
# Threshold = mean weight, read straight from the array instead of converting
# the list back into a second array.
mm = edge_index[:, 2].mean()
# Keep only the (i, j) pairs whose weight is above the mean.
_keep = edge_index[:, 2] > mm
selected_edges = [(int(i), int(j)) for i, j in edge_index[_keep, :2]]
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape
torch.Size([2, 51392])
np.save('edge_index_selected.npy', edge_index_selected)
pyg lesson6 따라하기
data설정(x, edge_index, y)
# Node feature: the transaction amount, as an (N, 1) float tensor.  Built from
# the underlying numpy array rather than a list wrapping a Series.
x = torch.tensor(df50_com['amt'].to_numpy(), dtype=torch.float).reshape(-1, 1)
x
tensor([[921.2400],
[698.2800],
[220.5600],
...,
[ 17.9700],
[ 7.5800],
[824.9900]])
# Node label: the fraud flag.
y = torch.tensor(df50_com['is_fraud'].to_numpy(), dtype=torch.int64)
y
tensor([1, 1, 0, ..., 1, 0, 1])
import torch_geometric
data = torch_geometric.data.Data(x=x, edge_index=edge_index_selected, y=y)

#train_mask = train_mask, test_mask = test_mask
data
Data(x=[12012, 1], edge_index=[2, 51392], y=[12012])
GCNConv
# Graph convolution layer: 1 input feature -> 4 output features.
gconv = torch_geometric.nn.GCNConv(1, 4)
gconv
GCNConv(1, 4)
gconv(data.x, data.edge_index)
tensor([[-5.1237e+02, 5.3152e+02, -2.9626e+01, 5.3703e+02],
[-4.2507e+02, 4.4096e+02, -2.4578e+01, 4.4553e+02],
[-1.9991e+02, 2.0738e+02, -1.1559e+01, 2.0953e+02],
...,
[-3.8459e+02, 3.9897e+02, -2.2238e+01, 4.0310e+02],
[-6.8703e+00, 7.1271e+00, -3.9725e-01, 7.2010e+00],
[-5.2357e+02, 5.4314e+02, -3.0273e+01, 5.4877e+02]],
grad_fn=<AddBackward0>)
list(gconv.parameters())
[Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True),
Parameter containing:
tensor([[-0.9064],
[ 0.9403],
[-0.0524],
[ 0.9500]], requires_grad=True)]
# The layer exposes two parameters; the second one is the weight matrix.
_, W = list(gconv.parameters())
W
Parameter containing:
tensor([[-0.9064],
[ 0.9403],
[-0.0524],
[ 0.9500]], requires_grad=True)
A = torch.tensor(aj_matrix, dtype=torch.float32)
# Add the identity, sized from A itself instead of the literal 12012.
Atilde = A + torch.eye(A.shape[0])
Atilde
tensor([[2., 0., 0., ..., 0., 0., 0.],
[0., 2., 0., ..., 0., 0., 0.],
[0., 0., 2., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 2., 0., 0.],
[0., 0., 0., ..., 0., 2., 0.],
[0., 0., 0., ..., 0., 0., 2.]])
엇? 자기자신은 = 0 을 .. 넣는게 여기든가!
# Manual product next to the layer output, for side-by-side comparison.
Atilde @ data.x @ W.T / 3, gconv(data.x, data.edge_index)
(tensor([[-2327.1099, 2414.0901, -134.5556, 2439.1140],
[-2175.9143, 2257.2434, -125.8134, 2280.6416],
[ -344.8129, 357.7009, -19.9374, 361.4088],
...,
[ -647.5668, 671.7708, -37.4429, 678.7343],
[-2294.9517, 2380.7297, -132.6962, 2405.4080],
[-3068.6165, 3183.3123, -177.4302, 3216.3098]],
grad_fn=<DivBackward0>),
tensor([[-5.1237e+02, 5.3152e+02, -2.9626e+01, 5.3703e+02],
[-4.2507e+02, 4.4096e+02, -2.4578e+01, 4.4553e+02],
[-1.9991e+02, 2.0738e+02, -1.1559e+01, 2.0953e+02],
...,
[-3.8459e+02, 3.9897e+02, -2.2238e+01, 4.0310e+02],
[-6.8703e+00, 7.1271e+00, -3.9725e-01, 7.2010e+00],
[-5.2357e+02, 5.4314e+02, -3.0273e+01, 5.4877e+02]],
grad_fn=<AddBackward0>))
A를 선택하는 방식을 다시 생각해 보니 잘못했다! 지금까지는 단순히 edge 여부(aj_matrix)만 사용했는데, GCNConv에 넣은 edge_index는 weight가 평균(mm)보다 큰 edge만 골라서 만든 것이다. 그렇다면 여기서도 그 mm 값을 기준으로 A를 구해야 할 것 같은데, 이건 어떻게 계산하는 거지?